######################################################################################
# Project:  Complexity Project
#
# Task:     Compare FK scores against other measures of syntactic complexity
#
# Author:   Martijn Schoonvelde
# Date:     November 2021
#
######################################################################################

#load libraries
#
# The workspace is intentionally NOT cleared with rm(list = ls()) here.
# That call deletes every object in the caller's session, yet it leaves
# attached packages, options and the working directory untouched, so it does
# not give a clean state. Run this script in a fresh R session instead.

library(quanteda)
library(stringr)
library(tidyverse)


#load data
# Restore the saved objects from every .Rdata file, in the order listed, into
# the environment this script runs in.
for (rdata_file in c("Data/Bundestag_Sample.Rdata",
                     "Data/Congresso_Sample.Rdata",
                     "Data/TweedeKamer_Sample.Rdata",
                     "Data/HouseofCommons_Sample.Rdata",
                     "Data/euspeech_validation.Rdata",
                     "Data/congress_dutch_validation.Rdata")) {
  load(rdata_file)
}
# Drop the loop variable so only the loaded objects remain.
rm(rdata_file)

# Tweede Kamer: rename the "flesch*" columns to the "flesh*" spelling that the
# rest of this script refers to (e.g. `flesh.grade` when scaling).
tk.corpus.sample <- rename(tk.corpus.sample,
                           flesh = flesch,
                           flesh.grade = flesch.grade)


# Remove the `X` column from the four parliamentary samples.
# NOTE(review): presumably a row index left over from a CSV export -- confirm.
bt.corpus.sample[["X"]] <- NULL
tk.corpus.sample[["X"]] <- NULL
cong.corpus.sample[["X"]] <- NULL
hoc.corpus.sample[["X"]] <- NULL

# EUSpeech: label the source and add all-NA placeholders for the columns it is
# given here, then drop its `document` column.
euspeech.corpus <- euspeech.corpus %>%
  mutate(parliament = "Heads of Government",
         terms = NA,
         yearmonth = NA,
         cmp = NA,
         cabinet.party = NA)
euspeech.corpus[["document"]] <- NULL

# Coerce the House of Commons sample to a plain data.frame.
hoc.corpus.sample <- as.data.frame(hoc.corpus.sample)

# Negated %in%: TRUE for every element of `x` that has no match in `y`.
`%!in%` <- function(x, y) {
  match(x, y, nomatch = 0L) == 0L
}

# Keep only those speeches_dutch columns that also exist in the House of
# Commons sample.
vars <- is.element(names(speeches_dutch), names(hoc.corpus.sample))
speeches_dutch <- speeches_dutch[, vars]

# Diagnostic: positions of House of Commons columns still absent from
# speeches_dutch (auto-printed when run at top level).
which(!(names(hoc.corpus.sample) %in% names(speeches_dutch)))

# Label the source and fill the remaining columns with NA placeholders.
speeches_dutch <- speeches_dutch %>%
  mutate(date = NA,
         parliament = "Dutch Congress Speeches",
         terms = NA,
         yearmonth = NA,
         cmp = NA,
         cabinet.party = NA)


# EUSpeech: turn `flesch.kincaid` into a `document` column in the same
# position, then blank it out.
# NOTE(review): presumably done so the column set matches the other samples
# for the later rbind() -- confirm.
euspeech.corpus <- rename(euspeech.corpus, document = flesch.kincaid)
euspeech.corpus$document <- NA


#custom scale function

# Standardize a numeric vector: subtract its mean and divide by its standard
# deviation. `na.rm` is forwarded to both mean() and sd(); with the default
# FALSE, any NA in `x` makes the whole result NA.
scale_this <- function(x, na.rm = FALSE) {
  centre <- mean(x, na.rm = na.rm)
  spread <- sd(x, na.rm = na.rm)
  (x - centre) / spread
}


#syntactic variables

# Names of the standardized indicator columns that are averaged into the
# composite scales: all four indicators, and a two-indicator "minimal" variant.
syntactic_variables <- c("ARI_scale", "avg_sl_scale", "syntactic_dependency_scale", "syntactic_depth_scale")
syntactic_variables_minimal <- c("ARI_scale", "avg_sl_scale")

# Add the standardized complexity columns to one corpus data frame.
#
# corpus: data frame with the columns ARI, avg_sl, syntactic_dependency,
#         syntactic_depth and flesh.grade.
#
# Returns `corpus` with these columns added (or overwritten), in this order:
#   ARI_scale, avg_sl_scale, syntactic_dependency_scale, syntactic_depth_scale,
#   syntactic_complexity_scale          (row mean of `syntactic_variables`),
#   syntactic_complexity_scale_minimal  (row mean of `syntactic_variables_minimal`),
#   flesh_grade_scale.
# Standardization uses scale_this() within the data frame passed in, so every
# corpus is scaled against its own mean and standard deviation.
# Stops with an informative error if a required input column is missing.
add_complexity_scales <- function(corpus) {
  raw_indicators <- c("ARI", "avg_sl", "syntactic_dependency", "syntactic_depth")

  missing_cols <- setdiff(c(raw_indicators, "flesh.grade"), names(corpus))
  if (length(missing_cols) > 0) {
    stop("Corpus is missing required column(s): ",
         paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }

  # One "<indicator>_scale" column per raw indicator; these names are the ones
  # listed in `syntactic_variables`.
  for (indicator in raw_indicators) {
    corpus[[paste0(indicator, "_scale")]] <- scale_this(corpus[[indicator]])
  }

  corpus$syntactic_complexity_scale <- rowMeans(corpus[, syntactic_variables])
  corpus$syntactic_complexity_scale_minimal <- rowMeans(corpus[, syntactic_variables_minimal])
  corpus$flesh_grade_scale <- scale_this(corpus$flesh.grade)

  corpus
}

#syntactic scale Congresso
cong.corpus.sample <- add_complexity_scales(cong.corpus.sample)

#syntactic scale Bundestag
bt.corpus.sample <- add_complexity_scales(bt.corpus.sample)

#syntactic scale Tweede Kamer
tk.corpus.sample <- add_complexity_scales(tk.corpus.sample)

#syntactic scale House of Commons
hoc.corpus.sample <- add_complexity_scales(hoc.corpus.sample)

#syntactic scale EUSpeech
euspeech.corpus <- add_complexity_scales(euspeech.corpus)

#syntactic scale Congress Dutch
speeches_dutch <- add_complexity_scales(speeches_dutch)

#save data

# Write each corpus to its own CSV file in the working directory, without row
# names. Corpora and file names are paired by position.
invisible(Map(
  function(corpus, csv_path) {
    write.csv(corpus, file = csv_path, row.names = FALSE)
  },
  list(cong.corpus.sample,
       tk.corpus.sample,
       bt.corpus.sample,
       hoc.corpus.sample,
       euspeech.corpus,
       speeches_dutch),
  c("Corp_Congresso_Sample_Validation.csv",
    "Corp_TweedeKamer_Sample_Validation.csv",
    "Corp_Bundestag_Sample_Validation.csv",
    "Corp_HouseofCommons_Sample_Validation.csv",
    "EUSPeech_Validation.csv",
    "Congress_Dutch_Validation.csv")
))

# Stack all six samples into one data frame for the faceted plots.
corpus.sample <- rbind(bt.corpus.sample,
                       cong.corpus.sample,
                       hoc.corpus.sample,
                       tk.corpus.sample,
                       euspeech.corpus,
                       speeches_dutch)

# Samples keyed by the facet label their correlation is printed under.
# NOTE(review): this assumes each label equals the `parliament` value stored in
# that sample. This script sets `parliament` only for euspeech.corpus and
# speeches_dutch; verify the four parliamentary samples.
validation_corpora <- list(
  "DE-Bundestag" = bt.corpus.sample,
  "ES-Congresso" = cong.corpus.sample,
  "NL-TweedeKamer" = tk.corpus.sample,
  "UK-HoC" = hoc.corpus.sample,
  "Heads of Government" = euspeech.corpus,
  "Dutch Congress Speeches" = speeches_dutch
)

# Build the per-facet annotation table.
#
# corpora: named list of data frames; the names become the `parliament` column.
# y_var:   name of the column to correlate with `flesh_grade_scale`.
#
# Returns a data frame with columns `parliament` and `correlation`, the latter
# being the text "r = <Pearson correlation rounded to 2 decimals>".
correlation_labels <- function(corpora, y_var) {
  r_values <- vapply(
    corpora,
    function(corpus) round(cor(corpus$flesh_grade_scale, corpus[[y_var]]), 2),
    numeric(1)
  )
  data.frame(parliament = names(corpora),
             correlation = paste("r =", r_values))
}

# Scatter plot of `flesh_grade_scale` against `y_var`, one facet per
# `parliament`, with the correlation label drawn at (2.5, -2) in every facet.
#
# plot_data:    stacked corpus data frame.
# facet_labels: data frame from correlation_labels().
# y_var:        name of the column plotted on the y axis.
# y_label:      y-axis title.
plot_complexity_correlation <- function(plot_data, facet_labels, y_var, y_label) {
  ggplot(plot_data, aes(flesh_grade_scale, .data[[y_var]])) +
    geom_point(pch = 21, fill = "gray25", color = "white", size = 2.5) +
    scale_x_continuous(name = "Flesch Kincaid Grade (normalized)") +
    scale_y_continuous(name = y_label) +
    theme_minimal() +
    facet_wrap(~parliament) +
    # The label position stays inside aes() so the axis scales still include it.
    geom_text(data = facet_labels,
              aes(x = 2.5, y = -2, label = correlation),
              inherit.aes = FALSE)
}


#plot correlations fk and syntactic complexity, 4 indicators

graphLabels <- correlation_labels(validation_corpora, "syntactic_complexity_scale")

complexity_correlation_plot <- plot_complexity_correlation(
  corpus.sample,
  graphLabels,
  y_var = "syntactic_complexity_scale",
  y_label = "Scale of 4 Indicators of Syntactic Complexity"
)

ggsave(filename = "syntactic_complexity_correlation.png",
       plot = complexity_correlation_plot,
       width = 10, height = 8)


#plot correlations fk and syntactic complexity, 2 indicators

graphLabels <- correlation_labels(validation_corpora, "syntactic_complexity_scale_minimal")

complexity_correlation_plot <- plot_complexity_correlation(
  corpus.sample,
  graphLabels,
  y_var = "syntactic_complexity_scale_minimal",
  y_label = "Scale of 2 Indicators of Syntactic Complexity"
)

ggsave(filename = "syntactic_complexity_2_indicators_correlation.png",
       plot = complexity_correlation_plot,
       width = 10, height = 8)

























